#!/bin/sh
#
#
# SGE startup script
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

#
# This script can be called with the following arguments:
#
#       start       start qmaster or shadowd 
#       stop        Terminates qmaster if we are on the master machine.
#       restart     equivalent to stop followed by start
#       -qmaster    only starts qmaster 
#       -shadowd    only starts shadowd if this host is found in the "shadow_masters" file
#       -migrate    shuts down qmaster if it is running
#                   on another host and start the daemons on this host
#
# If the file "primary_qmaster" in the $SGE_ROOT/$SGE_CELL/common
# exists and it contains the hostname of the current machine and qmaster
# is running on another host it will be shut down and started on this host
#
# Unix commands which may be used in this script:
#    cat cut tr ls grep awk sed basename
#
# This script requires the script $SGE_ROOT/util/arch
#

# Replace whatever search path the caller passed in: commands called by bare
# name in this script are looked up in these four system directories only.
PATH=/bin:/usr/bin:/sbin:/usr/sbin

#---------------------------------------------------------------------------
# The following lines provide the necessary info for adding a startup script
# according to the Linux Standard Base Specification (LSB) which can
# be found at:
#
#    http://www.linuxfoundation.org/spec/booksets/LSB-Core-generic/LSB-Core-generic/initscrcomconv.html
#
### BEGIN INIT INFO
# Provides:       sgemaster.<%= @cluster_name %>
# Required-Start: $network $remote_fs
# Required-Stop:
# Default-Start:  3 5
# Default-Stop: 0 1 2 6
# Description:  start Grid Engine qmaster, shadowd
### END INIT INFO
# chkconfig: 35 95 3
#---------------------------------------------------------------------------

# Installation settings of this cluster; all four are exported so that every
# command started from this script sees them.
SGE_ROOT=/opt/sge
SGE_CELL=default
SGE_QMASTER_PORT=6444
SGE_EXECD_PORT=6445
export SGE_ROOT SGE_CELL SGE_QMASTER_PORT SGE_EXECD_PORT

# Remove these variables from the environment handed to child processes.
unset CODINE_ROOT GRD_ROOT COD_CELL GRD_CELL

# Architecture string as reported by the arch helper of this installation.
ARCH=`$SGE_ROOT/util/arch`

# library path setting required only for architectures where RUNPATH is not supported
# Nothing is done when $SGE_ROOT/lib/$ARCH does not exist.
if [ -d $SGE_ROOT/lib/$ARCH ]; then
   case $ARCH in
   sol*|lx*)
      # no library path change for these architectures
      ;;
   *)
      # "arch -lib" prints the NAME of the variable to extend; eval is used
      # to read and to assign the variable through that name.
      shlib_path_name=`$SGE_ROOT/util/arch -lib`
      old_value=`eval echo '$'$shlib_path_name`
      if [ x$old_value = x ]; then
         # variable was empty or unset: it becomes just our library directory
         eval $shlib_path_name=$SGE_ROOT/lib/$ARCH
      else
         # keep the existing entries and append our library directory
         eval $shlib_path_name=$old_value:$SGE_ROOT/lib/$ARCH
      fi
      export $shlib_path_name
      ;;
   esac
fi

# Include SMF if available.
# NO_SMF stays 1 when the SMF include file is missing; otherwise it holds the
# exit status of smf_present (tested against 0 in DetectSMFService).
NO_SMF=1
[ -f /lib/svc/share/smf_include.sh ] && {
   . /lib/svc/share/smf_include.sh
   smf_present
   NO_SMF=$?
}

#---------------------------------------------------------------------------
# Shutdown
#    Stop the daemon named $1 whose process id is stored in the file $2.
#
#    As long as checkprog still finds the process, SIGTERM is sent again
#    every 2 seconds, at most 20 times; after that SIGKILL is sent.
#
#    Returns 0 as soon as checkprog no longer finds the process and also
#    when the pid file does not exist; after the last round the exit
#    status of "kill -9" is returned.
#
Shutdown()
{
   name=$1
   pidfile=$2

   if [ -f $pidfile ]; then
      pid=`cat $pidfile`
      maxretries=20
      i=0
      while [ $i -lt $maxretries ]; do
         # a non-zero checkprog status means the process is gone: done
         $utilbin_dir/checkprog $pid $name > /dev/null || return 0

         #We keep killing Qmaster so that child processes get killed
         kill $pid

         sleep 2
         i=`expr $i + 1`
      done

      # still there after all rounds: force it
      kill -9 $pid
      return $?
   fi
}


#---------------------------------------------------------------------------
# QmasterSpoolDir
#    Echo the qmaster spool directory, i.e. the second field of the
#    "qmaster_spool_dir" line in $SGE_ROOT/$SGE_CELL/common/bootstrap.
#    Nothing but an empty line is echoed when there is no such line.
#
QmasterSpoolDir()
{
   qma_spool_dir=`grep qmaster_spool_dir $SGE_ROOT/$SGE_CELL/common/bootstrap | awk '{ print $2 }'`
   echo $qma_spool_dir
}

# HostCompare
#    Compare the host names $1 and $2 without regard to upper/lower case.
#    Echo 0 when they are the same host, 1 otherwise.
#
#    Only the part in front of the first "." is compared, unless the
#    bootstrap file has an "ignore_fqdn" entry whose value is "false"
#    (in any case); then the full names are compared.
#
HostCompare()
{
   host1=$1
   host2=$2

   # short-name comparison is the default, also without a bootstrap file
   ignore_fqdn=true
   if [ -f $SGE_ROOT/$SGE_CELL/common/bootstrap ]; then
      ignore_fqdn_txt=`grep ignore_fqdn $SGE_ROOT/$SGE_CELL/common/bootstrap | awk '{print $2}'`
      case "$ignore_fqdn_txt" in
         [fF][aA][lL][sS][eE])
            ignore_fqdn=false
            ;;
      esac
   fi

   # host names are case insensitive, so both sides are lowered; the domain
   # part is cut off first when it is to be ignored
   if [ "$ignore_fqdn" = true ]; then
      host1=`echo $host1 | cut -f 1 -d . | tr "[A-Z]" "[a-z]"`
      host2=`echo $host2 | cut -f 1 -d . | tr "[A-Z]" "[a-z]"`
   else
      host1=`echo $host1 | tr "[A-Z]" "[a-z]"`
      host2=`echo $host2 | tr "[A-Z]" "[a-z]"`
   fi

   if [ "$host1" != "$host2" ]; then
      echo 1
   else
      echo 0
   fi
}


#---------------------------------------------------------------------------
# CheckIfQmasterHost
#    Echo "true" when the host name given in $1 and the content of the
#    "act_qmaster" file are the same host according to HostCompare,
#    echo "false" in every other case.
#
CheckIfQmasterHost()
{
   host=$1
   act_qmaster=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`

   # HostCompare echoes 0 for "same host"
   case `HostCompare $host $act_qmaster` in
      0)
         echo true
         ;;
      *)
         echo false
         ;;
   esac
}

#---------------------------------------------------------------------------
# CheckIfPrimaryQmasterHost
#    Echo "true" when the file "primary_qmaster" exists and its content and
#    the host name given in $1 are the same host according to HostCompare.
#    Echo "false" otherwise, also when the file does not exist.
#
CheckIfPrimaryQmasterHost()
{
   host=$1
   fname=$SGE_ROOT/$SGE_CELL/common/primary_qmaster

   if [ -f $fname ]; then
      primary_qmaster=`cat $fname`
      # HostCompare echoes 0 for "same host"
      case `HostCompare $host $primary_qmaster` in
         0)
            echo true
            ;;
         *)
            echo false
            ;;
      esac
   else
      echo false
   fi
}


#---------------------------------------------------------------------------
# CheckIfShadowMasterHost
#    Check if our hostname given in $1 is contained in the
#    "shadow_masters" file.
#
#    Nothing is echoed: the result is stored in the global variable
#    shadow_host, which is set to "true" when a line of the file consists of
#    exactly this host name (upper/lower case is ignored) and to "false"
#    otherwise, also when the file does not exist or $1 is empty.
#
CheckIfShadowMasterHost()
{
   host=$1

   fname=$SGE_ROOT/$SGE_CELL/common/shadow_masters

   shadow_host="false"

   # An empty host name would give the pattern "^$" and match blank lines.
   if [ "x$host" != x ]; then
      if [ -f "$fname" ]; then
         # Turn every "." into "[.]" so that it only matches a literal dot;
         # as a plain regular expression "." would match any character.
         host_pattern=`echo "$host" | sed 's/[.]/[.]/g'`

         # Redirect stdout first and stderr second, otherwise stderr is not
         # sent to /dev/null.
         if grep -i "^${host_pattern}\$" "$fname" > /dev/null 2>&1; then
            shadow_host="true"
         fi
      fi
   fi
}

#---------------------------------------------------------------------------
# GetPathToBinaries
#    echo the name of the bin_dir on this system
#
#    The directory is taken from the "binary_path" entry of the bootstrap
#    file. It is accepted if it contains the qstat binary, either directly
#    or in the sub directory named after the output of $SGE_ROOT/util/arch
#    (then the path including that sub directory is echoed).
#
#    echo "none" if we can't determine the binary path, i.e. when the
#    bootstrap file or its binary_path entry is missing or when qstat is
#    found in neither place.
#
GetPathToBinaries()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap

   # stays "none" unless one of the qstat checks below succeeds
   base=none

   if [ -f "$cfgname" ]; then
      bin_path=`grep binary_path "$cfgname" | awk '{ print $2 }'`
      if [ "x$bin_path" = x ]; then
         # no binary_path entry: an empty path must not be reported
         :
      elif [ -f "$bin_path/qstat" ]; then
         base=$bin_path
      elif [ -f "$SGE_ROOT/util/arch" ]; then
         arch=`"$SGE_ROOT/util/arch"`
         if [ -f "$bin_path/$arch/qstat" ]; then
            base=$bin_path/$arch
         fi
      fi
   fi

   echo "$base"
}


#---------------------------------------------------------------------------
# GetAdminUser
#    echo the name of the admin user on this system, i.e. the second field
#    of the "admin_user" line of the bootstrap file
#
#    echo "root" if admin user retrieval fails: the bootstrap file is
#    missing, it has no admin_user entry, or the entry is "none" (in any
#    upper/lower case spelling)
#
GetAdminUser()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap
   user=none

   if [ -f "$cfgname" ]; then
      user=`grep admin_user "$cfgname" | awk '{ print $2 }'`
   fi

   # The empty pattern covers a bootstrap file without admin_user entry;
   # with an unquoted test this case ended in a syntax error of "[" and an
   # empty user name.
   case "$user" in
      ""|[nN][oO][nN][eE])
         user=root
         ;;
   esac
   echo $user
}

#---------------------------------------------------------------------------
# GetPathToUtilbin
#    Echo the directory $SGE_ROOT/utilbin/<arch>, where <arch> is the output
#    of $SGE_ROOT/util/arch, provided that it contains the "gethostname"
#    binary.
#    Echo "none" when the arch script is missing or "gethostname" is not
#    found in that directory.
#
GetPathToUtilbin()
{
   base=none

   if [ -f $SGE_ROOT/util/arch ]; then
      utilbindir=$SGE_ROOT/utilbin
      arch=`$SGE_ROOT/util/arch`

      # accept the directory only if the reference binary is in it
      [ -f $utilbindir/$arch/gethostname ] && base=$utilbindir/$arch
   fi

   echo $base
}

#---------------------------------------------------------------------------
# CheckRunningQmaster
#    Check whether sge_qmaster answers: the host named in the "act_qmaster"
#    file is contacted with qping until qping succeeds or the attempt is
#    given up.
#
#    Uses the globals bin_dir, utilbin_dir, ARCH, HOST and SGE_QMASTER_PORT.
#
#    Returns 0 if qping succeeded and act_qmaster names this host ($HOST).
#    Returns 1 and prints a message if qping succeeded but act_qmaster names
#    another host, or if qping never succeeded.
#
CheckRunningQmaster()
{
   masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`

   # port to contact: the configured one, otherwise the number that
   # getservbyname prints for "sge_qmaster"
   if [ "$SGE_QMASTER_PORT" = "" ]; then
      ping_port=`$utilbin_dir/getservbyname -number sge_qmaster`
   else
      ping_port=$SGE_QMASTER_PORT
   fi

   # time stamp taken before the first attempt
   # NOTE(review): the unit printed by "now" is not visible here - presumably
   # seconds, which would make the limit below 60 seconds; confirm
   start=`$SGE_ROOT/utilbin/$ARCH/now 2>/dev/null`

   running=false
   retries=0
   qping_timeout=false

   # qping may have a long timeout in case of network or hostname resolution
   # related problems.
   # ensure that the test for a running qmaster does not take too long
   # by limiting the total time and numbers the connection test is repeated
   # we also require that the qmaster created a PID file before returning
   # NOTE(review): no PID file check is visible in this function - the last
   # sentence above does not match the code below
   
   # at most 31 qping attempts (retries counts from 0 up to 30)
   while [ $retries -le 30 ]; do
      $bin_dir/qping -info $masterhost $ping_port qmaster 1 > /dev/null 2>&1
      if [ $? -eq 0 ]; then
         running=true
         break
      else
         now=`$SGE_ROOT/utilbin/$ARCH/now 2>/dev/null`
         # the time stamp went backwards: restart the measurement from now
         if [ "$now" -lt "$start" ]; then
            start=$now
         fi
         elapsed=`expr $now - $start`
         if [ $elapsed -gt 60 ]; then
            # the limit was already used up by the very first qping call:
            # remember that, the message below then mentions a timeout
            if [ $retries -eq 0 ]; then
               qping_timeout=true
            fi
            break
         fi
         sleep 2
         # read act_qmaster again before the next attempt
         masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
         retries=`expr $retries + 1`
      fi
   done

   if [ $running = "true" ]; then
      # a qmaster answered - but it only counts if it is the one on this host
      if [ `CheckIfQmasterHost $HOST` = false ]; then
         echo "sge_qmaster is running on another host (${masterhost})"
         return 1
      else
         return 0
      fi
   else
      echo
      echo "sge_qmaster start problem"
      if [ $qping_timeout = true ]; then
         echo "Possibly a network or hostname configuration problem (got timeout)."
      fi
      echo
      return 1
   fi
}

#---------------------------------------------------------------------------
# DetectSMFService
#    Store the SMF service name for the daemon $1 in the global variable
#    "service": svc:/application/sge/<$1>:<content of the cluster_name file>
#
#    "service" is left empty when SMF is not to be used (-nosmf given or
#    NO_SMF is not 0) or when /usr/bin/svcs does not know the service.
#    The script is left with $SMF_EXIT_ERR_CONFIG when SMF is to be used
#    but the cluster_name file is not readable.
#
DetectSMFService()
{
   name=$1
   service=""

   if [ "$noSMF" = true -o $NO_SMF -ne 0 ]; then
      return
   fi

   # The cluster_name file is mandatory from here on
   [ -r "$SGE_ROOT/$SGE_CELL/common/cluster_name" ] || {
      echo "Error: could not find $SGE_ROOT/$SGE_CELL/common/cluster_name!"
      exit $SMF_EXIT_ERR_CONFIG
   }

   #Cluster name must be unique
   SGE_CLUSTER_NAME=`cat $SGE_ROOT/$SGE_CELL/common/cluster_name 2>/dev/null`

   service="svc:/application/sge/$name:$SGE_CLUSTER_NAME"

   # Forget the name again if the system has no such service
   /usr/bin/svcs $service > /dev/null 2>&1 || service=""
}


#---------------------------------------------------------------------------
# usage
#    Print the help text on stdout and leave the script with exit status 1.
#
usage()
{
   echo 'Grid Engine start/stop script. Valid parameters are:'
   echo
   echo '   (no parameters): start qmaster and shadow daemon if applicable'
   echo '   "start"        ditto.'
   echo '   "stop"         shut down qmaster and shadow daemon if applicable'
   echo '   "restart"      restart (stop and start) daemons'
   echo '   "-qmaster"     only start/stop qmaster (if applicable)'
   echo '   "-shadowd"     only start/stop shadowd (if applicable)'
   echo "   \"-migrate\"     shutdown qmaster if it's running on another"
   echo '                    host and restart it on this host'
   echo '                    Migration only works if this host is an admin host'
   echo '   "-nosmf"       force no SMF'
   echo
   echo 'Only one of "start", "stop", or "restart" is allowed.'
   echo 'Only one of the parameters beginning  with "-" is allowed. Does not '
   echo 'apply to -nosmf.'
   echo
   echo 'Default argument is "start" for all components.'
   echo 'Default for "stop" is shutting down all components.'
   echo
   exit 1
}


#---------------------------------------------------------------------------
# MAIN Procedure
#

# More than three arguments, or "-h"/"help" as first argument: show the
# help text (usage leaves the script).
if [ "$#" -gt 3 -o "$1" = "-h" -o "$1" = "help" ]; then
   usage
fi

# Defaults: start qmaster and shadowd, nothing is stopped, SMF is allowed.
startup=true
qmaster=true
shadowd=true
qstd=false
migrate_qmaster=false
noSMF=false
stop=false

for i in $*; do
   case "$i" in
   start)
      startup=true
      ;;
   stop)
      startup=false
      stop=true
      ;;
   restart)
      # both phases run, stop first
      stop=true
      startup=true
      ;;
   -qmaster)
      qmaster=true
      shadowd=false
      ;;
   -shadowd)
      qmaster=false
      shadowd=true
      ;;
   -migrate)
      # migration concerns the qmaster only
      migrate_qmaster=true
      qmaster=true
      shadowd=false
      ;;
   -nosmf)
      noSMF=true
      ;;
   *)
      usage
      ;;
   esac
done

# Locate the Grid Engine binaries; without them nothing below can work.
bin_dir=`GetPathToBinaries`
case "$bin_dir" in
none)
   echo "can't determine path to Grid Engine binaries"
   exit 5   # LSB compliant exit status - program is not installed
   ;;
esac

utilbin_dir=`GetPathToUtilbin`
case "$utilbin_dir" in
none)
   echo "can't determine path to Grid Engine utility binaries"
   exit 5   # LSB compliant exit status - program is not installed
   ;;
esac

# HOST is the name as printed by gethostname, UQHOST the same name cut off
# at the first "."
HOST=`$utilbin_dir/gethostname -aname`
UQHOST=`$utilbin_dir/gethostname -aname | cut -f1 -d.`
qmaster_spool_dir=`QmasterSpoolDir`

# sets the global variable shadow_host to true or false
CheckIfShadowMasterHost $HOST

# Stop phase: runs for "stop" and as the first half of "restart".
# shadowd is handled before qmaster.
if [  "$stop" = true ]; then
   # shadowd is only stopped on hosts listed in the shadow_masters file
   if [ $shadowd = true -a $shadow_host = true ]; then
      echo "   shutting down Grid Engine shadowd"
      DetectSMFService shadowd
      # An SMF service exists and this script is not running with SMF_FMRI
      # set to that very service: let SMF disable it.
      # NOTE(review): SMF_FMRI is presumably set by SMF when it runs this
      # script as the service's method - confirm
      if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
         svcadm disable -st $service
      else
         # Send SIGTERM to shadowd
         # the pid file name may contain the short or the full host name
         if [ -f $qmaster_spool_dir/shadowd_$UQHOST.pid ]; then
            Shutdown sge_shadowd $qmaster_spool_dir/shadowd_$UQHOST.pid
         elif [ -f $qmaster_spool_dir/shadowd_$HOST.pid ]; then
            Shutdown sge_shadowd $qmaster_spool_dir/shadowd_$HOST.pid
         fi	
      fi
   fi

   if [ $qmaster = true ]; then
      # qmaster is only stopped if act_qmaster names this host
      if [ `CheckIfQmasterHost $HOST` = true ]; then
         echo "   shutting down Grid Engine qmaster"
         DetectSMFService qmaster
         # same SMF decision as for shadowd above; in the SMF case the script
         # ends here with the status of svcadm, so a "restart" does not reach
         # the start phase
         if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
            svcadm disable -st $service
            exit $?
         else
            # Send SIGTERM to qmaster
            Shutdown sge_qmaster $qmaster_spool_dir/qmaster.pid
            ret=$?
            # The lock file is removed only by uid 0 and only after a
            # successful shutdown; if the file exists and either condition
            # fails the script ends with an error.
            if [ -f /var/lock/subsys/sgemaster ]; then
               uid=`$utilbin_dir/uidgid -uid`
               if [ "$uid" = "0" -a "$ret" = "0" ]; then            
                  rm -f /var/lock/subsys/sgemaster >/dev/null 2>&1
               else
                  echo "Can't shut down qmaster!"
                  exit 1
               fi
            fi
         fi
      fi
   fi
fi

# Start phase: runs for "start", for "restart" (after the stop phase) and
# when no action argument was given.
# Order: optional take-over of the qmaster from another host, start of
# sge_qmaster, start of sge_shadowd.
if [ "$startup" = true ]; then
   
   # qmaster_host=true          if the act_qmaster file names this host
   # primary_qmaster_host=true  if the primary_qmaster file exists and
   #                            names this host

   qmaster_host=`CheckIfQmasterHost $HOST`
   primary_qmaster_host=`CheckIfPrimaryQmasterHost $HOST`

   # -migrate makes no sense if act_qmaster already names this host
   if [ $qmaster = true -a $qmaster_host = true -a $migrate_qmaster = true ]; then
      echo "   qmaster running on this host. Will not migrate qmaster."
      exit 1
   fi

   # Take-over: act_qmaster names another host and either this host is the
   # primary qmaster host or -migrate was given.
   if [ $qmaster = true -a $qmaster_host = false -a  \
        \( $primary_qmaster_host = true -o $migrate_qmaster = true \) ]; then
       actual_qmaster_host=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
       echo "   shutting down qmaster on host \"$actual_qmaster_host\" ..."
       # "denied" in the qconf output is taken as "this host is not an admin
       # host".
       # NOTE(review): the first call is "qconf -ks", the qmaster itself is
       # addressed by "qconf -km" below - presumably -ks shuts down the
       # scheduler; confirm against the qconf documentation
       qconf_output=`$bin_dir/qconf -ks 2>&1 | grep "denied"`
       if [ "$qconf_output" != "" ]; then
          echo "   denied: host \"$HOST\" is not an admin host."
          exit 1
       fi
       $bin_dir/qconf -km > /dev/null 2>&1
       
       # Wait until the old qmaster no longer answers qping: at most 10
       # attempts with a pause of 3 seconds after each successful ping.
       qping_count=0
       qping_retries=10
       qping_exit_state=0
       if [ "$SGE_QMASTER_PORT" = "" ]; then
          ping_port=`$utilbin_dir/getservbyname -number sge_qmaster`
       else
          ping_port=$SGE_QMASTER_PORT
       fi
       while [ $qping_count -lt $qping_retries ]; do
          $bin_dir/qping -info $actual_qmaster_host $ping_port qmaster 1  > /dev/null 2>&1
          qping_exit_state=$?
          # qping failed: the old qmaster is gone
          if [ $qping_exit_state -ne 0 ]; then
             break
          fi
          sleep 3
          qping_count=`expr $qping_count + 1`
       done

       if [ $qping_exit_state -eq 0 ]; then
       #  qmaster is still running
          echo "   qmaster on host $actual_qmaster_host still alive. Cannot migrate qmaster."
          exit 1
       fi

       # Wait for the file "lock" in the qmaster spool directory: at most 10
       # checks, 3 seconds apart.
       lock_file_read_retries=10
       lock_file_read_count=0
       lock_file_found=0
       while [ $lock_file_read_count -lt $lock_file_read_retries ]; do
          if [ -f $qmaster_spool_dir/lock ]; then
             lock_file_found=1
             break
          fi
          sleep 3
          lock_file_read_count=`expr $lock_file_read_count + 1`
       done

       if [ $lock_file_found -eq 0 ]; then
       #  old qmaster did not write lock file
          echo "   old qmaster did not write lock file. Cannot migrate qmaster."
          echo "   Please verify that qmaster on host $actual_qmaster_host is down"
          echo "   and make sure that the lock file in qmaster spool directory is"
          echo "   read-able."
          exit 1
       fi

       # from here on this host is treated as the qmaster host
       qmaster_host=true
       #If we use SMF, we need to notify the SMF service
       # MIGRATE_SMF_STEP=true is stored in the service's environment; it is
       # tested further below and removed again after a successful start
       DetectSMFService qmaster
       if [ -n "$service" ]; then
          svccfg -s $service setenv MIGRATE_SMF_STEP true
          if [ $? -ne 0 ]; then
             echo "Migration failed!"
             echo "It seems you do not have permission to modify the $service SMF service."
             exit 1
          else
             svcadm refresh $service
          fi
       fi
   fi

   # collects the result of the qmaster start; used as exit status at the end
   exit_val=0
   
   #Need to check if this is a SMF migration
   # i.e. this script runs with SMF_FMRI equal to the qmaster service and
   # MIGRATE_SMF_STEP=true is in the environment
   DetectSMFService qmaster
   if [ -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" -a "$MIGRATE_SMF_STEP" = true ]; then
      qmaster_host=true
   fi

   if [ $qmaster = true -a $qmaster_host = false ]; then
      echo
      echo "sge_qmaster didn't start!"
      echo "This is not a qmaster host!"
      echo "Check your ${SGE_ROOT}/${SGE_CELL}/common/act_qmaster file!" 
      echo
      # go on to the shadowd part only if shadowd is wanted and a
      # shadow_masters file exists
      if [ $shadowd = false -o ! -f $SGE_ROOT/$SGE_CELL/common/shadow_masters ]; then
         exit 1
      fi
   elif [ $qmaster = true ]; then
      already_running=false
      #Check if pid file exists
      # a non-empty pid file whose process checkprog accepts as sge_qmaster
      # means the daemon is already running
      if [ -s "$qmaster_spool_dir/qmaster.pid" ]; then
         daemon_pid=`cat "$qmaster_spool_dir/qmaster.pid"`
         $utilbin_dir/checkprog $daemon_pid sge_qmaster > /dev/null
         if [ $? -eq 0 ]; then
            already_running=true
         fi
      fi
      # We can't detect pid file race, but we'll catch it most of the time
      if [ "$already_running" = "true" ]; then
         echo
         echo "sge_qmaster with PID $daemon_pid is already running"
         echo
      else
         #We want to use smf
         # an SMF service exists and SMF_FMRI does not name it: hand the
         # start over to SMF
         if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
            echo "   starting sge_qmaster"
            svcadm enable -st $service
            exit_val=$?
         #For -migrate with SMF qmaster_host is not yet set for SMF start (2nd)
         elif [ $qmaster_host = true -o \( -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" \) ]; then
            echo "   starting sge_qmaster"
            $bin_dir/sge_qmaster
            # the lock file is created only after a successful start and only
            # if the directory /var/lock/subsys exists
            [ $? -eq 0 -a -d /var/lock/subsys ] && touch /var/lock/subsys/sgemaster >/dev/null 2>&1
            CheckRunningQmaster
            exit_val=$?
            # successful start during an SMF migration: remove the marker
            # that was set with "svccfg setenv" in the take-over part
            if [ $exit_val -eq 0 -a -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" -a "$MIGRATE_SMF_STEP" = true ]; then
               svccfg -s $service unsetenv MIGRATE_SMF_STEP
               if [ $? -ne 0 ]; then
                  echo "Warning: SMF migration cleanup step failed!"
                  echo "It seems you do not have permission to modify the $service SMF service."
                  echo
                  echo "Run following commands manually as root or appropriate user:"
                  echo "svccfg -s $service unsetenv MIGRATE_SMF_STEP"
                  echo "svcadm refresh $service"
               else
                  svcadm refresh $service
               fi
            fi
         fi
         if [ $exit_val -ne 0 ]; then
            echo "sge_qmaster didn't start!"
         fi
      fi
   fi

   # shadowd part
   if [ $shadowd = true -a $shadow_host = false ]; then
      #Display the message only if we have installed any shadowds
      if [ -f $SGE_ROOT/$SGE_CELL/common/shadow_masters ]; then
         echo
         echo "sge_shadowd didn't start!"
         echo "This is not a shadow master host!"
         echo "Check your ${SGE_ROOT}/${SGE_CELL}/common/shadow_masters file!"
         echo
      elif [ $qmaster = false ]; then
         #Shadow masters file does not exist and we try to start only shadowd
         echo
         echo "sge_shadowd didn't start!"
         echo "File ${SGE_ROOT}/${SGE_CELL}/common/shadow_masters does not exist!"
         echo "No shadowd installed?"
         echo
      fi
      # it is an error unless a qmaster start was requested on its host
      if [ $qmaster_host = false -o $qmaster = false ]; then
         exit 1
      fi
   elif [ $shadowd = true ]; then
      start_shadowd=true
      # the pid file name may contain the short or the full host name;
      # a live sge_shadowd found through either file prevents a second start
      UQpidfile=$qmaster_spool_dir/shadowd_$UQHOST.pid
      pidfile=$qmaster_spool_dir/shadowd_$HOST.pid

      if [ -f $pidfile ]; then
         pid=`cat $pidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            start_shadowd=false 
         fi
      fi

      if [ -f $UQpidfile ]; then
         pid=`cat $UQpidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            start_shadowd=false
         fi
      fi

      if [ $start_shadowd = true ]; then
         DetectSMFService shadowd
         echo "   starting sge_shadowd"
         #We want to use smf
         if [ \( -z "$SMF_FMRI" -o "$SMF_FMRI" != "$service" \) -a -n "$service" ]; then
            svcadm enable -st $service
            res=$?
         else
            $bin_dir/sge_shadowd
            res=$?
         fi
         # a failed shadowd start ends the script with that status
         if [ $res -ne 0 ]; then
            echo "   sge_shadowd didn't start correctly!"
            exit $res
         fi
      else
         echo "   found running sge_shadowd - not starting"
      fi
   fi
      
   # report a failed qmaster start to the caller
   if [ $exit_val -ne 0 ]; then
      exit $exit_val
   fi
fi
